0. Load libraries, scripts and data

### Libraries
library(magrittr)
library(ArchR)
library(RWireX)
library(ggpubr)
library(purrr)
library(Seurat)
library(parallel)
library(scDblFinder)
library(qs)
source("plotCellQuality.R")
source("plotUmapQC.R")
source("plotUmapClusters.R")
source("plotUmaps.R")
### Set ArchR parameters.
addArchRThreads(threads = 30) # number of parallel threads
addArchRGenome("hg38") # genome and gene annotation ("hg19", "hg38", "mm9", "mm10")
### Set HDF5 environment variables to prevent HDF5 error later
Sys.setenv(HDF5_USE_FILE_LOCKING=FALSE)
Sys.setenv(RHDF5_USE_FILE_LOCKING=FALSE)
### Samples
samples <- c("PBMC_scATAC", "PBMC_scTurboATAC_16", "PBMC_scTurboATAC_13")

### Sample palette
sample_palette <- c("grey", "blue", "darkblue")
names(sample_palette) <- samples
### Data
inputFiles <- c("PBMC_Multiome_10xTn5_CellRangerARC_fragments.tsv.gz",
                "PBMC_Multiome_Tn5hc_50perc_CellRangerARC_fragments.tsv.gz",
                "PBMC_Multiome_Tn5hc_CellRangerARC_fragments.tsv.gz")
names(inputFiles) <- samples

1. Create Arrow Files

overview
ArrowFiles <- createArrowFiles(
  inputFiles = inputFiles,
  sampleNames = names(inputFiles),
  minTSS = 0, #Dont set this too high because you can always increase later
  minFrags = 100,
  maxFrags = 5000000,
  addTileMat = TRUE,
  addGeneScoreMat = FALSE,
  verbose = FALSE, 
  subThreading = FALSE 
)
names(ArrowFiles) <- gsub("\\.arrow", "", ArrowFiles)

2. Create and preprocess ArchRProject

overview
proj_list <- list()

for (sample in samples){
    proj_list[[sample]] <- ArchRProject(
      ArrowFiles = ArrowFiles[[sample]], 
      outputDirectory = paste0("ArchRProj_", sample),
      copyArrows = TRUE #This is recommened so that if you modify the Arrow files you have an original copy for later usage.
    )
}
proj <- ArchRProject(
      ArrowFiles = ArrowFiles, 
      outputDirectory = "ArchRProj_all",
      copyArrows = TRUE #This is recommened so that if you modify the Arrow files you have an original copy for later usage.
    )

a) Quality Control

### Fragment Size distributions
fragsize_samples <- plotFragmentSizes(ArchRProj = proj, pal = sample_palette) + theme(plot.title = element_text(size=30), axis.title = element_text(size=28), axis.text = element_text(size=28), axis.text.x=element_text(angle=0), legend.title = element_text(size=0), legend.text = element_text(size=16)) +
ggtitle("Fragment size distribution")

### TSS enrichment profiles
tssprofiles_samples <- plotTSSEnrichment(ArchRProj = proj, pal = sample_palette) + theme(plot.title = element_text(size=30), axis.title = element_text(size=28), axis.text = element_text(size=28), axis.text.x=element_text(angle=0), legend.title = element_text(size=0), legend.text = element_text(size=16)) +
ggtitle("TSS enrichment profile")
options(repr.plot.width=32, repr.plot.height=10)
ggAlignPlots(fragsize_samples, tssprofiles_samples, type = "h")

b) Sample Statistics

log10frags_samples <- ggplot(lapply(proj_list, function(x){data.frame(AnalysesGroups = factor(x$Sample, levels = samples), log10nFrags = log10(x$nFrags))}) %>% do.call(rbind, .), aes(x = AnalysesGroups, y = log10nFrags)) + geom_violin(aes(fill = AnalysesGroups), alpha = 0.8) + geom_boxplot(alpha = 0.001) + 
                        theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=30), axis.title = element_text(size=28), axis.text = element_text(size=28), axis.text.x=element_text(angle=90, vjust=1, hjust=1), 
                                                legend.title = element_text(size=28), legend.text = element_text(size=28)) +
                        scale_fill_manual(values = sample_palette) + ggtitle("Number of fragments") + xlab("") + ylab("log10(nFrags)")
tss_samples <- ggplot(lapply(proj_list, function(x){data.frame(AnalysesGroups = factor(x$Sample, levels = samples), tss = x$TSSEnrichment)}) %>% do.call(rbind, .), aes(x = AnalysesGroups, y = tss)) + geom_violin(aes(fill = AnalysesGroups), alpha = 0.8) + geom_boxplot(alpha = 0.001) + 
                        theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=30), axis.title = element_text(size=28), axis.text = element_text(size=28), axis.text.x=element_text(angle=90, vjust=1, hjust=1), 
                                                legend.title = element_text(size=28), legend.text = element_text(size=28)) +
                        scale_fill_manual(values = sample_palette) + ggtitle("TSS enrichment scores") + xlab("") + ylab("TSS enrichment score")
options(repr.plot.width=20, repr.plot.height=12)
ggarrange(log10frags_samples, tss_samples, ncol = 2)

c) Cell Statistics

minTSS <- c(6, 6, 6)
names(minTSS) <- samples
minFrag <- c(4, 4, 4)
names(minFrag) <- samples
p_list <- lapply(samples, function(sample){plotCellQuality(proj_list[[sample]], sample, set_xlim = c(2, 6.5), set_ylim = c(0, 45), 
                                                           cutoff_feature1 = minFrag[sample], cutoff_feature2 = minTSS[sample])})
names(p_list) <- samples
options(repr.plot.width=30, repr.plot.height=13)
ggarrange(plotlist = p_list, ncol = 3, nrow = 1)
p_list <- lapply(samples, function(sample){plotCellQuality(proj_list[[sample]], sample, feature2 = "log10(ReadsInTSS)",
                                                           cutoff_feature1 = 3.8, cutoff_feature2 = 3,
                                                           set_xlim = c(2, 6.5), set_ylim = c(0, 6))})
names(p_list) <- samples
options(repr.plot.width=30, repr.plot.height=13)
ggarrange(plotlist = p_list, ncol = 3, nrow = 1)
Warning message:
“Removed 56 rows containing missing values (`geom_point()`).”
Warning message:
“Removed 11 rows containing missing values (`geom_point()`).”
Warning message:
“Removed 22 rows containing missing values (`geom_point()`).”

3. Filtering I

overview

a) Filtering by minimal number of fragments

Filter cells by minFrag cutoff for a normal distribution of unique fragments per cell.

  • PBMC_scATAC: 10^4.1
  • PBMC_scTurboATAC: 10^4.4

Lower cutoff for PBMC_scATAC to maintain even cell numbers.

minFrag <- c(3.8, 3.8, 3.8)
names(minFrag) <- samples
proj_list <- lapply(names(proj_list), function(x){proj_list[[x]][proj_list[[x]]$cellNames[which(getCellColData(proj_list[[x]], select = "log10(nFrags)")[,1] >= minFrag[x])], ]})
names(proj_list) <- samples

b) Filtering by minimal TSS enrichment score

Filter cells by minTSS cutoff for a normal distribution of TSS enrichment scores per cell.

  • PBMC_scATAC: 10
  • PBMC_scTurboATAC: 8
minTSS <- c(1000, 1000, 1000)
names(minTSS) <- samples
proj_list <- lapply(names(proj_list), function(x){proj_list[[x]][proj_list[[x]]$cellNames[which(getCellColData(proj_list[[x]], select = "ReadsInTSS")[,1] > minTSS[x])], ]})
names(proj_list) <- samples
for (sample in samples){
    print(sample)
    nFiltered <- getCellColData(proj_list[[sample]]) %>% nrow(.)
    print(paste0("Filtered cells: ", nFiltered))
}
[1] "PBMC_scATAC"
[1] "Filtered cells: 4563"
[1] "PBMC_scTurboATAC_16"
[1] "Filtered cells: 7405"
[1] "PBMC_scTurboATAC_13"
[1] "Filtered cells: 6690"

c) Sample Statistics

log10frags_samples <- ggplot(lapply(proj_list, function(x){data.frame(AnalysesGroups = factor(x$Sample, levels = samples), log10nFrags = log10(x$nFrags))}) %>% do.call(rbind, .), aes(x = AnalysesGroups, y = log10nFrags)) + geom_violin(aes(fill = AnalysesGroups), alpha = 0.8) + geom_boxplot(alpha = 0.001) + 
                        theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=30), axis.title = element_text(size=28), axis.text = element_text(size=28), axis.text.x=element_text(angle=90, vjust=1, hjust=1), 
                                                legend.title = element_text(size=28), legend.text = element_text(size=28)) +
                        scale_fill_manual(values = sample_palette) + ggtitle("Number of fragments") + xlab("") + ylab("log10(nFrags)")
tss_samples <- ggplot(lapply(proj_list, function(x){data.frame(AnalysesGroups = factor(x$Sample, levels = samples), tss = x$TSSEnrichment)}) %>% do.call(rbind, .), aes(x = AnalysesGroups, y = tss)) + geom_violin(aes(fill = AnalysesGroups), alpha = 0.8) + geom_boxplot(alpha = 0.001) + 
                        theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=30), axis.title = element_text(size=28), axis.text = element_text(size=28), axis.text.x=element_text(angle=90, vjust=1, hjust=1), 
                                                legend.title = element_text(size=28), legend.text = element_text(size=28)) +
                        scale_fill_manual(values = sample_palette) + ggtitle("TSS enrichment scores") + xlab("") + ylab("TSS enrichment score")
options(repr.plot.width=32, repr.plot.height=20)
ggarrange(log10frags_samples, tss_samples, ncol = 2)
stats <- data.frame(c("Sample", "mean(log10(nFrags))", "median(log10(nFrags))",
                                  "mean(log10(ReadsInTSS))", "median(log10(ReadsInTSS))",
                                  "mean(TSSEnrichment)", "median(TSSEnrichment)"))
for (sample in samples){
    stats <- cbind(stats, c(sample,
                            getCellColData(proj_list[[sample]], select = "log10(nFrags)")[,1] %>% mean(.) %>% round(., 2),
                            getCellColData(proj_list[[sample]], select = "log10(nFrags)")[,1] %>% median(.) %>% round(., 2),
                            getCellColData(proj_list[[sample]], select = "log10(ReadsInTSS)")[,1] %>% mean(.) %>% round(., 2),
                            getCellColData(proj_list[[sample]], select = "log10(ReadsInTSS)")[,1] %>% median(.) %>% round(., 2),
                            getCellColData(proj_list[[sample]], select = "TSSEnrichment")[,1] %>% mean(.) %>% round(., 2),
                            getCellColData(proj_list[[sample]], select = "TSSEnrichment")[,1] %>% median(.) %>% round(., 2)))
}
colnames(stats) <- NULL
stats
A data.frame: 7 × 4
Sample PBMC_scATACPBMC_scTurboATAC_16PBMC_scTurboATAC_13
mean(log10(nFrags)) 4.64 4.47 4.57
median(log10(nFrags)) 4.65 4.47 4.56
mean(log10(ReadsInTSS)) 4.33 4.11 4.08
median(log10(ReadsInTSS))4.35 4.16 4.13
mean(TSSEnrichment) 13.75 12.65 9.82
median(TSSEnrichment) 13.45 12.3 9.75

d) Cell Statistics

p_list <- lapply(samples, function(sample){plotCellQuality(proj_list[[sample]], sample, set_xlim = c(2, 6.5), set_ylim = c(0, 45), 
                                                           cutoff_feature1 = minFrag[sample], cutoff_feature2 = 4)})
names(p_list) <- samples
options(repr.plot.width=30, repr.plot.height=13)
ggarrange(plotlist = p_list, ncol = 3, nrow = 1)
p_list <- lapply(samples, function(sample){plotCellQuality(proj_list[[sample]], sample, feature2 = "log10(ReadsInTSS)",
                                                           cutoff_feature1 = 3.8, cutoff_feature2 = 3,
                                                           set_xlim = c(2, 6.5), set_ylim = c(0, 6))})
names(p_list) <- samples
options(repr.plot.width=30, repr.plot.height=13)
ggarrange(plotlist = p_list, ncol = 3, nrow = 1)

e) Separate single-cell embeddings

proj_list <- lapply(proj_list, function(x){addIterativeLSI(
    ArchRProj = x,
    useMatrix = "TileMatrix", 
    name = "IterativeLSI", 
    iterations = 2, 
    clusterParams = list( #See Seurat::FindClusters
        resolution = c(0.2), 
        sampleCells = 10000, 
        n.start = 10
    ), 
    varFeatures = 25000, 
    dimsToUse = 1:30,
    force = TRUE
)})
IterativeLSI_list <- lapply(proj_list, function(x){getReducedDims(x, "IterativeLSI", returnMatrix = FALSE)})
options(repr.plot.width=21, repr.plot.height=5)
par(mfrow=c(1, 3))
for (x in samples){
    plot(IterativeLSI_list[[x]]$svd$d, bty='l', xlab='LSI components', main=x, ylab='SVD deviation', pch=16, cex.lab=1.2, cex.axis=1.1)
    plot(IterativeLSI_list[[x]]$svd$d, bty='l', xlab='LSI components', ylab='SVD deviation', pch=16, cex.lab=1.2, cex.axis=1.1, ylim=c(0,1500))
    plot(IterativeLSI_list[[x]]$corToDepth$none, bty='l', xlab='LSI components', ylab='Correlation to sequencing depth', pch=16, cex.lab=1.2, cex.axis=1.1)
}
LSIcomponents <- list(2:17, 2:17, 2:17)
names(LSIcomponents) <- samples
proj_list <- lapply(names(proj_list), function(x){addIterativeLSI(
    ArchRProj = proj_list[[x]],
    useMatrix = "TileMatrix", 
    name = "IterativeLSI", 
    iterations = 2, 
    clusterParams = list( #See Seurat::FindClusters
        resolution = c(0.2), 
        sampleCells = 10000, 
        n.start = 10
    ), 
    varFeatures = 25000, 
    dimsToUse = LSIcomponents[[x]],
    force = TRUE
)})

names(proj_list) <- samples
proj_list <- lapply(proj_list, function(x){addUMAP(
    ArchRProj = x, 
    reducedDims = "IterativeLSI", 
    name = "UMAP", 
    nNeighbors = 30, 
    minDist = 0.5, 
    metric = "cosine",
    force = TRUE
)})

4. Doublet detection

overview

a) Amulet

amulet_res <- lapply(samples, function(sample){res <- amulet(inputFiles[sample],
                                                             barcodes = rownames(proj_list[[sample]]@cellColData) %>% gsub(paste0(sample, "#"), "", .),
                                                             regionsToExclude = GRanges(c("M","chrM","MT","X","Y","chrX","chrY"), IRanges(1L, width=10^8)), # excluding repeats, as well as sex and mitochondrial chromosomes
                                                             uniqueFrags = TRUE, # only use unique fragments
                                                             maxFragSize = 1000L, # maximum fragment size to consider
                                                             removeHighOverlapSites = TRUE); # remove sites that have more than two reads in unexpectedly many cells
                                               rownames(res) <- paste0(sample, "#", rownames(res));
                                               colnames(res) <- paste0("Amulet_", colnames(res));
                                               qsave(res, paste0("Amulet_DoubletDetection_", sample, ".qs"));
                                               return(res)})
for (sample in samples){
    proj_list[[sample]]@cellColData <- cbind(proj_list[[sample]]@cellColData, amulet_res[[sample]][match(rownames(proj_list[[sample]]@cellColData), rownames(amulet_res[[sample]])),])
    proj_list[[sample]]@cellColData$Amulet_doublet <- proj_list[[sample]]@cellColData$Amulet_q.value <= 1e-5
}

b) Correlation with sequencing depth

plot_list <- list()

for (sample in samples){
    plot_list[["1"]][[sample]] <- ggplot(as.data.frame(proj_list[[sample]]@cellColData), aes(x = log10(Amulet_nFrags), y = log10(Amulet_q.value))) + ggpointdensity::geom_pointdensity() +
                                    ggtitle(sample) + 
                                    ggpubr::stat_cor(p.accuracy = 0.001, r.accuracy = 0.01, method = "spearman", size = 5) +
                                    theme_minimal() + theme(text = element_text(size = 15))
    plot_list[["2"]][[sample]] <- ggplot(as.data.frame(proj_list[[sample]]@cellColData), aes(x = log10(Amulet_nFrags), y = log10(Amulet_p.value))) + ggpointdensity::geom_pointdensity() +
                                    ggtitle(sample) + 
                                    ggpubr::stat_cor(p.accuracy = 0.001, r.accuracy = 0.01, method = "spearman", size = 5) +
                                    theme_minimal() + theme(text = element_text(size = 15))
}
options(repr.plot.width=6*length(samples), repr.plot.height=5)
ggarrange(plotlist = plot_list[["1"]], ncol = length(samples))
ggarrange(plotlist = plot_list[["2"]], ncol = length(samples))

c) Inspect doublets

umap_list <- list()

for (sample in samples){
    umap_list[[sample]] <- plotEmbedding(proj_list[[sample]], embedding = "UMAP", 
                                         colorBy = "cellColData", name = "Amulet_doublet",
                                         pal = c("FALSE" = "grey", "TRUE" = "darkred"), size = 0.5)
}
options(repr.plot.width=25, repr.plot.height=10)
ggarrange(plotlist = umap_list, ncol=length(samples), nrow=1)
for (sample in samples){
    print(sample)
    print(table(proj_list[[sample]]@cellColData$Amulet_doublet))
    print(round(sum(proj_list[[sample]]@cellColData$Amulet_doublet) / nrow(proj_list[[sample]]@cellColData), 2))
}
[1] "PBMC_scATAC"

FALSE  TRUE 
 4247   316 
[1] 0.07
[1] "PBMC_scTurboATAC_16"

FALSE  TRUE 
 7069   336 
[1] 0.05
[1] "PBMC_scTurboATAC_13"

FALSE  TRUE 
 6226   464 
[1] 0.07

5. Filtering II

overview

a) Filtering by doublets

proj_list <- lapply(names(proj_list), function(x){proj_list[[x]][proj_list[[x]]$cellNames[!proj_list[[x]]$Amulet_doublet]]})
names(proj_list) <- samples

b) Sample Statistics

log10frags_samples <- ggplot(lapply(proj_list, function(x){data.frame(AnalysesGroups = factor(x$Sample, levels = samples), log10nFrags = log10(x$nFrags))}) %>% do.call(rbind, .), aes(x = AnalysesGroups, y = log10nFrags)) + geom_violin(aes(fill = AnalysesGroups), alpha = 0.8) + geom_boxplot(alpha = 0.001) + 
                        theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=14), axis.title = element_text(size=12), axis.text = element_text(size=12), axis.text.x=element_text(angle=90, vjust=1, hjust=1), 
                                                legend.title = element_text(size=12), legend.text = element_text(size=12)) +
                        scale_fill_manual(values = sample_palette) + ggtitle("Number of fragments") + xlab("") + ylab("log10(nFrags)")
tss_samples <- ggplot(lapply(proj_list, function(x){data.frame(AnalysesGroups = factor(x$Sample, levels = samples), tss = x$TSSEnrichment)}) %>% do.call(rbind, .), aes(x = AnalysesGroups, y = tss)) + geom_violin(aes(fill = AnalysesGroups), alpha = 0.8) + geom_boxplot(alpha = 0.001) + 
                        theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=14), axis.title = element_text(size=12), axis.text = element_text(size=12), axis.text.x=element_text(angle=90, vjust=1, hjust=1), 
                                                legend.title = element_text(size=12), legend.text = element_text(size=12)) +
                        scale_fill_manual(values = sample_palette) + ggtitle("TSS enrichment scores") + xlab("") + ylab("TSS enrichment score")
options(repr.plot.width=10, repr.plot.height=8)
ggarrange(log10frags_samples, tss_samples, ncol = 2)

c) Cell Statistics

p_list <- lapply(samples, function(sample){plotCellQuality(proj_list[[sample]], sample, set_xlim = c(2, 6.5), set_ylim = c(0, 45), 
                                                           cutoff_feature1 = minFrag[sample], cutoff_feature2 = 4)})
names(p_list) <- samples
options(repr.plot.width=30, repr.plot.height=13)
ggarrange(plotlist = p_list, ncol = 3, nrow = 1)

d) Separate single-cell embeddings

proj_list <- lapply(proj_list, function(x){addIterativeLSI(
    ArchRProj = x,
    useMatrix = "TileMatrix", 
    name = "IterativeLSI", 
    iterations = 2, 
    clusterParams = list( #See Seurat::FindClusters
        resolution = c(0.2), 
        sampleCells = 10000, 
        n.start = 10
    ), 
    varFeatures = 25000, 
    dimsToUse = 1:30,
    force = TRUE
)})
IterativeLSI_list <- lapply(proj_list, function(x){getReducedDims(x, "IterativeLSI", returnMatrix = FALSE)})
options(repr.plot.width=21, repr.plot.height=5)
par(mfrow=c(1, length(samples)))
for (x in samples){
    plot(IterativeLSI_list[[x]]$svd$d, bty='l', xlab='LSI components', main=x, ylab='SVD deviation', pch=16, cex.lab=1.2, cex.axis=1.1)
    plot(IterativeLSI_list[[x]]$svd$d, bty='l', xlab='LSI components', ylab='SVD deviation', pch=16, cex.lab=1.2, cex.axis=1.1, ylim=c(0,1500))
    plot(IterativeLSI_list[[x]]$corToDepth$none, bty='l', xlab='LSI components', ylab='Correlation to sequencing depth', pch=16, cex.lab=1.2, cex.axis=1.1)
}
LSIcomponents <- list(2:20, 2:20, 2:20)
names(LSIcomponents) <- samples
proj_list <- lapply(names(proj_list), function(x){addIterativeLSI(
    ArchRProj = proj_list[[x]],
    useMatrix = "TileMatrix", 
    name = "IterativeLSI", 
    iterations = 2, 
    clusterParams = list( #See Seurat::FindClusters
        resolution = c(0.2), 
        sampleCells = 10000, 
        n.start = 10
    ), 
    varFeatures = 25000, 
    dimsToUse = LSIcomponents[[x]],
    force = TRUE
)})

names(proj_list) <- samples
proj_list <- lapply(proj_list, function(x){addUMAP(
    ArchRProj = x, 
    reducedDims = "IterativeLSI", 
    name = "UMAP", 
    nNeighbors = 30, 
    minDist = 0.5, 
    metric = "cosine",
    force = TRUE
)})
umap_list <- lapply(names(proj_list), function(x){plotUmapQC(proj_list[[x]], "UMAP", "Sample", x, doubletScore=FALSE, coloringPalette=sample_palette)})
names(umap_list) <- names(proj_list)
options(repr.plot.width=25, repr.plot.height=20*length(samples))
lapply(umap_list, function(x){ggpubr::ggarrange(plotlist=x, ncol=3, nrow=2)}) %>% ggpubr::ggarrange(plotlist=., ncol=1, nrow=length(samples))

6. Filtering III

overview

a) Filtering by blacklist ratio

for (sample in samples){
    print(sample)
    nFiltered <- getCellColData(proj_list[[sample]]) %>% nrow(.)
    print(paste0("Filtered cells: ", nFiltered))
}
[1] "PBMC_scATAC"
[1] "Filtered cells: 4247"
[1] "PBMC_scTurboATAC_16"
[1] "Filtered cells: 7069"
[1] "PBMC_scTurboATAC_13"
[1] "Filtered cells: 6226"
proj_list <- lapply(names(proj_list), function(x){proj_list[[x]][proj_list[[x]]$cellNames[which(getCellColData(proj_list[[x]], select = "BlacklistRatio")[,1] < 0.01)], ]})
names(proj_list) <- samples
for (sample in samples){
    print(sample)
    nFiltered <- getCellColData(proj_list[[sample]]) %>% nrow(.)
    print(paste0("Filtered cells: ", nFiltered))
}
[1] "PBMC_scATAC"
[1] "Filtered cells: 4196"
[1] "PBMC_scTurboATAC_16"
[1] "Filtered cells: 6965"
[1] "PBMC_scTurboATAC_13"
[1] "Filtered cells: 6178"

b) Filtering by nucleosome ratio

proj_list <- lapply(names(proj_list), function(x){proj_list[[x]][proj_list[[x]]$cellNames[which(getCellColData(proj_list[[x]], select = "NucleosomeRatio")[,1] < 3)], ]})
names(proj_list) <- samples
for (sample in samples){
    print(sample)
    nFiltered <- getCellColData(proj_list[[sample]]) %>% nrow(.)
    print(paste0("Filtered cells: ", nFiltered))
}
[1] "PBMC_scATAC"
[1] "Filtered cells: 4141"
[1] "PBMC_scTurboATAC_16"
[1] "Filtered cells: 6785"
[1] "PBMC_scTurboATAC_13"
[1] "Filtered cells: 5765"

c) Sample Statistics

log10frags_samples <- ggplot(lapply(proj_list, function(x){data.frame(AnalysesGroups = factor(x$Sample, levels = samples), log10nFrags = log10(x$nFrags))}) %>% do.call(rbind, .), aes(x = AnalysesGroups, y = log10nFrags)) + geom_violin(aes(fill = AnalysesGroups), alpha = 0.8) + geom_boxplot(alpha = 0.001) + 
                        theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=14), axis.title = element_text(size=12), axis.text = element_text(size=12), axis.text.x=element_text(angle=90, vjust=1, hjust=1), 
                                                legend.title = element_text(size=12), legend.text = element_text(size=12)) +
                        scale_fill_manual(values = sample_palette) + ggtitle("Number of fragments") + xlab("") + ylab("log10(nFrags)")
tss_samples <- ggplot(lapply(proj_list, function(x){data.frame(AnalysesGroups = factor(x$Sample, levels = samples), tss = x$TSSEnrichment)}) %>% do.call(rbind, .), aes(x = AnalysesGroups, y = tss)) + geom_violin(aes(fill = AnalysesGroups), alpha = 0.8) + geom_boxplot(alpha = 0.001) + 
                        theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=14), axis.title = element_text(size=12), axis.text = element_text(size=12), axis.text.x=element_text(angle=90, vjust=1, hjust=1), 
                                                legend.title = element_text(size=12), legend.text = element_text(size=12)) +
                        scale_fill_manual(values = sample_palette) + ggtitle("TSS enrichment scores") + xlab("") + ylab("TSS enrichment score")
options(repr.plot.width=10, repr.plot.height=8)
ggarrange(log10frags_samples, tss_samples, ncol = 2)

d) Cell Statistics

p_list <- lapply(samples, function(sample){plotCellQuality(proj_list[[sample]], sample, set_xlim = c(2, 6.5), set_ylim = c(0, 45), 
                                                           cutoff_feature1 = minFrag[sample], cutoff_feature2 = 4)})
names(p_list) <- samples
options(repr.plot.width=30, repr.plot.height=13)
ggarrange(plotlist = p_list, ncol = 3, nrow = 1)

e) Separate single-cell embeddings

proj_list <- lapply(proj_list, function(x){addIterativeLSI(
    ArchRProj = x,
    useMatrix = "TileMatrix", 
    name = "IterativeLSI", 
    iterations = 2, 
    clusterParams = list( #See Seurat::FindClusters
        resolution = c(0.2), 
        sampleCells = 10000, 
        n.start = 10
    ), 
    varFeatures = 25000, 
    dimsToUse = 1:30,
    force = TRUE
)})
IterativeLSI_list <- lapply(proj_list, function(x){getReducedDims(x, "IterativeLSI", returnMatrix = FALSE)})
options(repr.plot.width=21, repr.plot.height=5)
par(mfrow=c(1, length(samples)))
for (x in samples){
    plot(IterativeLSI_list[[x]]$svd$d, bty='l', xlab='LSI components', main=x, ylab='SVD deviation', pch=16, cex.lab=1.2, cex.axis=1.1)
    plot(IterativeLSI_list[[x]]$svd$d, bty='l', xlab='LSI components', ylab='SVD deviation', pch=16, cex.lab=1.2, cex.axis=1.1, ylim=c(0,1500))
    plot(IterativeLSI_list[[x]]$corToDepth$none, bty='l', xlab='LSI components', ylab='Correlation to sequencing depth', pch=16, cex.lab=1.2, cex.axis=1.1)
}
LSIcomponents <- list(2:20, 2:20, 2:20)
names(LSIcomponents) <- samples
proj_list <- lapply(names(proj_list), function(x){addIterativeLSI(
    ArchRProj = proj_list[[x]],
    useMatrix = "TileMatrix", 
    name = "IterativeLSI", 
    iterations = 2, 
    clusterParams = list( #See Seurat::FindClusters
        resolution = c(0.2), 
        sampleCells = 10000, 
        n.start = 10
    ), 
    varFeatures = 25000, 
    dimsToUse = LSIcomponents[[x]],
    force = TRUE
)})

names(proj_list) <- samples
proj_list <- lapply(proj_list, function(x){addUMAP(
    ArchRProj = x, 
    reducedDims = "IterativeLSI", 
    name = "UMAP", 
    nNeighbors = 30, 
    minDist = 0.5, 
    metric = "cosine",
    force = TRUE
)})
umap_list <- lapply(names(proj_list), function(x){plotUmapQC(proj_list[[x]], "UMAP", "Sample", x, doubletScore=FALSE, coloringPalette=sample_palette)})
names(umap_list) <- names(proj_list)
options(repr.plot.width=25, repr.plot.height=20*length(samples))
lapply(umap_list, function(x){ggpubr::ggarrange(plotlist=x, ncol=3, nrow=2)}) %>% ggpubr::ggarrange(plotlist=., ncol=1, nrow=length(samples))

7. Clustering of single cells

overview

a) Evaluate different clustering resolutions

umap_clusters_filtered <- lapply(proj_list, function(x){plotUmapClusters(x, reducedDims="IterativeLSI", 
                                           embedding="UMAP", resolutions = c(0.1, 0.2, 0.3))})
options(repr.plot.width=28, repr.plot.height=10*length(samples))
lapply(umap_clusters_filtered, function(x){ggpubr::ggarrange(plotlist=x, ncol=3, nrow=1)}) %>% ggpubr::ggarrange(plotlist=., ncol=1, nrow=length(samples))

b) Cluster cells

proj_list <- lapply(names(proj_list), function(x){addClusters(proj_list[[x]], 
                                                              reducedDims = "IterativeLSI", dimsToUse = LSIcomponents[[x]], 
                                                              method = "Seurat", name = "Clusters", resolution = 0.3)})
names(proj_list) <- samples
lapply(proj_list, function(x){table(x$Clusters)})
$PBMC_scATAC

  C1   C2   C3   C4   C5   C6   C7   C8   C9 
 321  536  688  173  625  117  406 1089  186 

$PBMC_scTurboATAC_16

  C1  C10  C11   C2   C3   C4   C5   C6   C7   C8   C9 
 151  869  873 1535  364  246   71   43  470 1102 1061 

$PBMC_scTurboATAC_13

  C1  C10  C11   C2   C3   C4   C5   C6   C7   C8   C9 
  69 1331  274  457  259   60 1117  732  771  569  126 

c) Single-cell embeddings

umap_list <- lapply(proj_list, function(x){plotUmaps(x, embeddings = c("UMAP"), coloringLayer = "Clusters")[[1]]})
names(umap_list) <- samples
options(repr.plot.width=8*length(samples), repr.plot.height=10)
ggarrange(plotlist = umap_list, ncol=length(samples), nrow=1)

d) Sample statistics

log10frags_samples <- ggplot(lapply(proj_list, function(x){data.frame(AnalysesGroups = x$Clusters, log10nFrags = log10(x$nFrags), Sample = x$Sample)}) %>% do.call(rbind, .), aes(x = AnalysesGroups, y = log10nFrags)) + geom_violin(aes(fill = AnalysesGroups), alpha = 0.8) + geom_boxplot(alpha = 0.001) + 
                        theme_minimal() + facet_wrap(~Sample) + theme(legend.position = "none", plot.title = element_text(size=14), axis.title = element_text(size=12), axis.text = element_text(size=12), axis.text.x=element_text(angle=90, vjust=1, hjust=1), 
                                                legend.title = element_text(size=12), legend.text = element_text(size=12), strip.text = element_text(size = 12)) +
                        scale_fill_manual(values = paletteDiscrete(paste0("C", 1:15))) + ggtitle("Number of fragments") + xlab("") + ylab("log10(nFrags)")
tss_samples <- ggplot(lapply(proj_list, function(x){data.frame(AnalysesGroups = x$Clusters, tss = x$TSSEnrichment, Sample = x$Sample)}) %>% do.call(rbind, .), aes(x = AnalysesGroups, y = tss)) + geom_violin(aes(fill = AnalysesGroups), alpha = 0.8) + geom_boxplot(alpha = 0.001) + 
                        theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=14), axis.title = element_text(size=12), axis.text = element_text(size=12), axis.text.x=element_text(angle=90, vjust=1, hjust=1), 
                                                legend.title = element_text(size=12), legend.text = element_text(size=12), strip.text = element_text(size = 12)) +
                        scale_fill_manual(values = paletteDiscrete(paste0("C", 1:15))) + ggtitle("TSS enrichment scores") + xlab("") + ylab("TSS enrichment score") + facet_wrap(~Sample)
options(repr.plot.width=6*length(samples), repr.plot.height=10)
ggarrange(log10frags_samples, tss_samples, nrow = 2)

8. Save ArchRProject

### Save ArchRProject
for (sample in samples){
    saveArchRProject(ArchRProj = proj_list[[sample]], outputDirectory = paste0("ArchRProj_", sample), load = FALSE)
}
lapply(names(proj_list), function(x){all(rownames(proj_list[[x]]@cellColData) == rownames(proj_list[[x]]@embeddings$UMAP$df)) %>% print(.);
                                     write.csv(cbind(proj_list[[x]]@embeddings$UMAP$df, proj_list[[x]]@cellColData), paste0("RawData_", x, ".csv"))})

#


Isabelle Seufert
Division of Chromatin Networks, DKFZ
12.09.2023

sessionInfo()
R version 4.2.2 (2022-10-31)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)

Matrix products: default
BLAS/LAPACK: /home/bq_ilander/miniconda3/envs/RWireX_v0.2.02/lib/libopenblasp-r0.3.21.so

Random number generation:
 RNG:     L'Ecuyer-CMRG 
 Normal:  Inversion 
 Sample:  Rejection 
 
locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
 [1] parallel  stats4    grid      stats     graphics  grDevices utils    
 [8] datasets  methods   base     

other attached packages:
 [1] uwot_0.1.14                       BSgenome.Hsapiens.UCSC.hg38_1.4.4
 [3] BSgenome_1.66.3                   rtracklayer_1.58.0               
 [5] Biostrings_2.66.0                 XVector_0.38.0                   
 [7] qs_0.25.4                         scDblFinder_1.12.0               
 [9] SingleCellExperiment_1.20.0       SeuratObject_4.1.3               
[11] Seurat_4.3.0                      purrr_1.0.1                      
[13] ggpubr_0.4.0                      plotgardener_1.4.1               
[15] ggExtra_0.10.0                    RWireX_0.2.03                    
[17] rhdf5_2.42.0                      SummarizedExperiment_1.28.0      
[19] Biobase_2.58.0                    RcppArmadillo_0.12.2.0.0         
[21] Rcpp_1.0.10                       Matrix_1.5-4                     
[23] GenomicRanges_1.50.1              GenomeInfoDb_1.34.9              
[25] IRanges_2.32.0                    S4Vectors_0.36.0                 
[27] BiocGenerics_0.44.0               sparseMatrixStats_1.10.0         
[29] MatrixGenerics_1.10.0             matrixStats_0.63.0               
[31] data.table_1.14.8                 stringr_1.5.0                    
[33] plyr_1.8.8                        ggplot2_3.4.2                    
[35] gtable_0.3.3                      gtools_3.9.4                     
[37] gridExtra_2.3                     devtools_2.4.5                   
[39] usethis_2.1.6                     ArchR_1.0.3                      
[41] magrittr_2.0.3                   

loaded via a namespace (and not attached):
  [1] pbdZMQ_0.3-9              scattermore_1.0          
  [3] ragg_1.2.5                tidyr_1.3.0              
  [5] irlba_2.3.5.1             DelayedArray_0.24.0      
  [7] RCurl_1.98-1.9            generics_0.1.3           
  [9] ScaledMatrix_1.6.0        callr_3.7.3              
 [11] cowplot_1.1.1             RApiSerialize_0.1.2      
 [13] RANN_2.6.1                future_1.32.0            
 [15] ggpointdensity_0.1.0      spatstat.data_3.0-1      
 [17] httpuv_1.6.10             viridis_0.6.3            
 [19] evaluate_0.21             promises_1.2.0.1         
 [21] fansi_1.0.4               restfulr_0.0.15          
 [23] igraph_1.4.2              htmlwidgets_1.6.2        
 [25] spatstat.geom_3.2-1       ellipsis_0.3.2           
 [27] dplyr_1.1.2               backports_1.4.1          
 [29] RcppParallel_5.1.6        deldir_1.0-6             
 [31] vctrs_0.6.2               remotes_2.4.2            
 [33] Cairo_1.6-0               ROCR_1.0-11              
 [35] abind_1.4-5               cachem_1.0.8             
 [37] withr_2.5.0               progressr_0.13.0         
 [39] sctransform_0.3.5         GenomicAlignments_1.34.0 
 [41] strawr_0.0.9              prettyunits_1.1.1        
 [43] scran_1.26.0              goftest_1.2-3            
 [45] cluster_2.1.4             IRdisplay_1.1            
 [47] lazyeval_0.2.2            crayon_1.5.2             
 [49] spatstat.explore_3.1-0    edgeR_3.40.0             
 [51] pkgconfig_2.0.3           labeling_0.4.2           
 [53] nlme_3.1-162              vipor_0.4.5              
 [55] pkgload_1.3.2             rlang_1.1.1              
 [57] globals_0.16.2            lifecycle_1.0.3          
 [59] miniUI_0.1.1.1            rsvd_1.0.5               
 [61] polyclip_1.10-4           lmtest_0.9-40            
 [63] IRkernel_1.3.1            carData_3.0-5            
 [65] Rhdf5lib_1.20.0           zoo_1.8-12               
 [67] base64enc_0.1-3           beeswarm_0.4.0           
 [69] ggridges_0.5.4            processx_3.8.1           
 [71] png_0.1-8                 viridisLite_0.4.2        
 [73] rjson_0.2.21              stringfish_0.15.7        
 [75] bitops_1.0-7              KernSmooth_2.23-21       
 [77] rhdf5filters_1.10.0       DelayedMatrixStats_1.20.0
 [79] parallelly_1.35.0         spatstat.random_3.1-4    
 [81] rstatix_0.7.2             gridGraphics_0.5-1       
 [83] ggsignif_0.6.4            beachmat_2.14.0          
 [85] scales_1.2.1              memoise_2.0.1            
 [87] ica_1.0-3                 zlibbioc_1.44.0          
 [89] compiler_4.2.2            dqrng_0.3.0              
 [91] BiocIO_1.8.0              RColorBrewer_1.1-3       
 [93] fitdistrplus_1.1-11       Rsamtools_2.14.0         
 [95] cli_3.6.1                 urlchecker_1.0.1         
 [97] listenv_0.9.0             patchwork_1.1.2          
 [99] pbapply_1.7-0             ps_1.7.5                 
[101] MASS_7.3-60               tidyselect_1.2.0         
[103] stringi_1.7.12            textshaping_0.3.6        
[105] yaml_2.3.7                BiocSingular_1.14.0      
[107] locfit_1.5-9.7            ggrepel_0.9.3            
[109] tools_4.2.2               future.apply_1.10.0      
[111] uuid_1.1-0                bluster_1.8.0            
[113] metapod_1.6.0             plyranges_1.18.0         
[115] farver_2.1.1              Rtsne_0.16               
[117] digest_0.6.31             shiny_1.7.4              
[119] car_3.1-2                 broom_1.0.4              
[121] scuttle_1.8.0             later_1.3.1              
[123] RcppAnnoy_0.0.20          httr_1.4.6               
[125] colorspace_2.1-0          XML_3.99-0.14            
[127] fs_1.6.2                  tensor_1.5               
[129] reticulate_1.26           splines_4.2.2            
[131] yulab.utils_0.0.6         statmod_1.5.0            
[133] spatstat.utils_3.0-3      scater_1.26.0            
[135] sp_1.6-0                  xgboost_1.7.4.1          
[137] ggplotify_0.1.0           systemfonts_1.0.4        
[139] plotly_4.10.1             sessioninfo_1.2.2        
[141] xtable_1.8-4              jsonlite_1.8.4           
[143] R6_2.5.1                  profvis_0.3.8            
[145] pillar_1.9.0              htmltools_0.5.5          
[147] mime_0.12                 glue_1.6.2               
[149] fastmap_1.1.1             BiocParallel_1.32.5      
[151] BiocNeighbors_1.16.0      codetools_0.2-19         
[153] pkgbuild_1.4.0            utf8_1.2.3               
[155] lattice_0.21-8            spatstat.sparse_3.0-1    
[157] tibble_3.2.1              curl_4.3.3               
[159] ggbeeswarm_0.7.2          leiden_0.4.3             
[161] survival_3.5-5            limma_3.54.0             
[163] repr_1.1.6                munsell_0.5.0            
[165] GenomeInfoDbData_1.2.9    reshape2_1.4.4